rm(list=ls()) # clear workspace

# set working directory
setwd("C:/Users/cailin.slattery/Dropbox/Incentives/postJM/Replication")
library(MASS)
library(ggplot2)
library(ORDER2PARENT)
library(haven)
library(bbmle)
library(psych)
library(mvtnorm)
library(dplyr)
library(stringr)
library(tidyr)
library(copula)
library(tidyverse)

set.seed(31389) 
theme_set(theme_classic())

#-------------------------------------------------------------------------------#
# Helper functions for MLE

# Creates data matrix for varlist + interaction terms
create_mat <- function(data, vars) {
    i <- 1
    for (v in vars) {
        if (grepl(":", v, fixed = TRUE)) {
            x <- str_split(v, ":", simplify = TRUE)
            if (i == 1) {
                X <- data[, x[1]] * data[, x[2]]
            } else {
                X[, v] <- data[, x[1]] * data[, x[2]]
            }
        } else {
            if (i == 1) {
                X <- data[, v]
            } else {
                X[, v] <- data[, v]
            }
        }
        i <- i + 1
    }
    data.matrix(X)
}

# Creates variable names for random coefficients
get_varnames_rand <- function(rand_vars) {
    sapply(rand_vars, function(x) {paste("sigma", x, sep="_")})
}

# Runs linear model with varlist as input
run_lm <- function(data, vars) {
    vars_full <- c(0, vars)
    f <- as.formula(paste("sub_M", paste(vars_full, collapse = " + "), sep = " ~ "))
    lm(f, data=data)
}

# Runs MLE with random coefficients
run_mle_rand <- function(data, rand_obs, vars, rand_vars, method = "BFGS"){
    
    LL_rand <- function(params) {
        b <- params[reg_param_names]
        b_rand <- params[rand_param_names]
        sigma <- params['sigma']
        
        resid <- data$sub_M - X %*% b
        resid <- matrix(resid, nrow = nrow(resid), ncol = n_rand)
        rand_prod <- X_rand %*% b_rand
        rand_prod <- matrix(rand_prod, nrow = nrow(resid), ncol = n_rand)
        resid <- resid - rand_prod * rand_obs
        
        resid <- suppressWarnings(dnorm(resid, 0, sigma))
        
        mean_resid <- rowMeans(resid)
        -sum(log(mean_resid))
    }
    
    lm <- run_lm(data, vars)
    
    reg_param_names <- names(coef(lm))
    rand_param_names <- get_varnames_rand(rand_vars)
    param_names <- c(reg_param_names, rand_param_names, 'sigma')
    
    coef <- coef(lm)
    
    start_vals <- coef
    start_vals[rand_param_names] <- 5
    start_vals['sigma'] <- summary(lm)$sigma
    
    X <- create_mat(data, names(coef))
    X_rand <- create_mat(data, rand_vars)
    
    parnames(LL_rand) <- param_names
    mle <- mle2(minuslogl = LL_rand, start = start_vals,
                method = method, vecpar = TRUE, control = list(maxit = 5000))
    list("lm" = lm, "mle" = mle)
}

#-------------------------------------------------------------------------------#
# Filter data + create dummies

df <- read_stata("data/analysis_cz.dta")
df <- filter(df, ru_limit == 1) # filter to only include CZs with runner-ups

df$manuf_type_1 <- as.integer(df$manuf_type == 1)
df$manuf_type_3 <- as.integer(df$manuf_type == 3)

df$trade_0 <- as.integer(df$trade == 0)
df$trade_1 <- as.integer(df$trade == 1)

df <- filter(df, sub_M < 1000)

# sample one runner-up within subsidy ID
set.seed(1)
df <- df %>%
    group_by(id_win) %>%
    slice_sample(n = 1)

# Generate random variables
set.seed(1)
n_rand = 1000
rand_obs = matrix(nrow = nrow(df), ncol = n_rand)
for (i in 1:n_rand) {
    rand_obs[, i] <- rnorm(nrow(df))
}
#-------------------------------------------------------------------------------#
### RUN MLE FOR MANUFACTURING

cur_df <- filter(df, manuf == 1)
cur_rand_obs <- rand_obs[df$manuf == 1, ]

profit_vars <- c("diff_corp_tax", 
                 "diff_income_tax",
                 "diff_proptax",
                 "diff_wage",
                 "diff_occ_top2_naics3",
                 "diff_occ_top2_naics3:jobs_direct",
                 "diff_pr_college:manuf_type_1",
                 "diff_pr_college:manuf_type_3",
                 "diff_perc_est_n4:manuf_type_1",
                 "diff_corp_tax:invest_B",
                 "diff_e_ind:invest_B",
                 "diff_indus_zoning:invest_B",
                 "diff_auto_roadnetwork:manuf_type_3") 

profit_vars_win <- c("corp_tax_win", 
                     "income_tax_win",
                     "proptax_win",
                     "wage_win",
                     "occ_top2_naics3_win",
                     "occ_top2_naics3_win:jobs_direct",
                     "pr_college_win:manuf_type_1",
                     "pr_college_win:manuf_type_3",
                     "perc_est_n4_win:manuf_type_1",
                     "corp_tax_win:invest_B",
                     "e_ind_win:invest_B",
                     "indus_zoning_win:invest_B",
                     "auto_roadnetwork_win:manuf_type_3") 

profit_vars_ru <- c("corp_tax", 
                    "income_tax",
                    "proptax",
                    "wage",
                    "occ_top2_naics3",
                    "occ_top2_naics3:jobs_direct",
                    "pr_college:manuf_type_1",
                    "pr_college:manuf_type_3",
                    "perc_est_n4:manuf_type_1",
                    "corp_tax:invest_B",
                    "e_ind:invest_B",
                    "indus_zoning:invest_B",
                    "auto_roadnetwork:manuf_type_3")

v_vars <- c("jobs_direct",
            "mult_jobs",
            "invest_B",
            "income_tax",
            "corp_tax",
            "term_limit",
            "incwage_top_5",
            "unemp",
            "jobs_direct:unemp",
            "ln_pinc",
            "incwage_top_5:ln_pinc")

# variables for random coefficients
rand_vars <- c("diff_corp_tax",
                      "diff_income_tax", 
                      "diff_wage",
                      "diff_occ_top2_naics3")

# combine in matrix 
X <- create_mat(cur_df, c(profit_vars, v_vars, rand_vars))
na_filter <- rowSums(is.na(X)) == 0
cur_df <- cur_df[na_filter, ]
cur_rand_obs <- cur_rand_obs[na_filter, ]

# RUN MLE
vars <- c(profit_vars, v_vars)
res <- run_mle_rand(cur_df, cur_rand_obs, vars, rand_vars)

## regression coefficient tables
# export to csv so can make table for paper:
mle_tab <- coef(summary(res$mle))[, 1:2]
write.csv(mle_tab, "data/mle_coef_manuf.csv")

#-------------------------------------------------------------------------------#
## Calculate profits, welfare, and valuations; output figs + summary stats
mle <- res$mle

# calculate residuals
reg_coef <- coef(mle)[!grepl("sigma", names(coef(mle)))]
X <- create_mat(cur_df, names(reg_coef))
resid <- cur_df$sub_M - (X %*% reg_coef)

profit_vars_mle <- coef(mle)[grepl("diff", names(coef(mle))) & !grepl("sigma", names(coef(mle)))]
v_vars_mle <- coef(mle)[!grepl("diff", names(coef(mle)))  & !grepl("sigma", names(coef(mle)))]

# Win profits
X <- create_mat(cur_df, profit_vars_win)
profits_win <- X %*% profit_vars_mle

# Ru profits
X <- create_mat(cur_df, profit_vars_ru)
profits_ru <- X %*% profit_vars_mle

# Ru valuation
X <- create_mat(cur_df, v_vars)
v_ru <- X %*% v_vars_mle

# subtract min profits + calculate welfare
min_profit_win <- min(profits_win, profits_ru)
profits_win_orig <- profits_win
profits_win <- profits_win - min_profit_win 
profits_ru_orig <- profits_ru
profits_ru <- profits_ru - min_profit_win 
welfare_ru_orig <- profits_ru_orig + v_ru + resid
welfare_ru <- profits_ru + v_ru + resid
v_ru_resid <- v_ru + resid

# combine in a matrix 
mat <- cbind(profits_win, profits_win_orig, profits_ru, profits_ru_orig,
             v_ru, v_ru_resid, welfare_ru, welfare_ru_orig)
colnames(mat) <- c("profits_win", "profits_win_orig", "profits_ru", "profits_ru_orig",
                   "v_ru", "v_ru_resid", "welfare_ru", "welfare_ru_orig")

#valuation of runner-up, conditional on chars
v_ru_cond <- v_ru -
    coef(mle)['jobs_direct'] * cur_df$jobs_direct -
    coef(mle)['mult_jobs'] * cur_df$mult_jobs -
    coef(mle)['invest_B'] * cur_df$invest_B +
    coef(mle)['jobs_direct'] * mean(cur_df$jobs_direct) +
    coef(mle)['mult_jobs'] * mean(cur_df$mult_jobs)  +
    coef(mle)['invest_B'] * mean(cur_df$invest_B) 
v_ru_cond_resid = v_ru_cond + resid
v_ru_cond_resid[v_ru_cond_resid<0] <-0

#profit of runner-up, conditional on chars
profits_ru_cond <- profits_ru -
  coef(mle)['diff_occ_top2_naics3:jobs_direct'] * (cur_df$occ_top2_naics3 * cur_df$jobs_direct) +
  coef(mle)['diff_occ_top2_naics3:jobs_direct'] * cur_df$occ_top2_naics3 * mean(cur_df$jobs_direct) -
  coef(mle)['invest_B:diff_indus_zoning'] * (cur_df$indus_zoning * cur_df$invest_B) +
  coef(mle)['invest_B:diff_indus_zoning'] * cur_df$indus_zoning * mean(cur_df$invest_B) -
  coef(mle)['diff_corp_tax:invest_B'] * (cur_df$corp_tax * cur_df$invest_B) +
  coef(mle)['diff_corp_tax:invest_B'] * cur_df$corp_tax * mean(cur_df$invest_B) -
  coef(mle)['invest_B:diff_e_ind'] * (cur_df$e_ind * cur_df$invest_B) +
  coef(mle)['invest_B:diff_e_ind'] * cur_df$e_ind * mean(cur_df$invest_B)

#profit of winner, conditional on chars
profits_win_cond <- profits_win -
  coef(mle)['diff_occ_top2_naics3:jobs_direct'] * (cur_df$occ_top2_naics3_win * cur_df$jobs_direct) +
  coef(mle)['diff_occ_top2_naics3:jobs_direct'] * cur_df$occ_top2_naics3_win * mean(cur_df$jobs_direct) -
  coef(mle)['invest_B:diff_indus_zoning'] * (cur_df$indus_zoning_win * cur_df$invest_B) +
  coef(mle)['invest_B:diff_indus_zoning'] * cur_df$indus_zoning_win * mean(cur_df$invest_B) -
  coef(mle)['diff_corp_tax:invest_B'] * (cur_df$corp_tax_win * cur_df$invest_B) +
  coef(mle)['diff_corp_tax:invest_B'] * cur_df$corp_tax_win * mean(cur_df$invest_B) -
  coef(mle)['invest_B:diff_e_ind'] * (cur_df$e_ind_win * cur_df$invest_B) +
  coef(mle)['invest_B:diff_e_ind'] * cur_df$e_ind_win * mean(cur_df$invest_B)

## export for figures: 
corr <- cbind(profits_ru_cond, profits_ru, v_ru, v_ru_cond, v_ru_cond_resid, v_ru_resid, cur_df$id_win)
write.csv(corr, "data/runner_up_correlations.csv")

welfare_ru_cond <- profits_ru_cond + v_ru_cond_resid

###################################################################################
# distribution of profits with and without random coefs, also conditional on jobs, investment.
###################################################################################
rand_vars_win <- c("corp_tax_win", "income_tax_win", "wage_win", "occ_top2_naics3_win")
rand_vars_ru <- c("corp_tax", "income_tax", "wage",  "occ_top2_naics3")

rand_coefs <- coef(mle)[grepl("sigma", names(coef(mle))) & names(coef(mle)) != "sigma"]
rand_eta <- matrix(nrow = nrow(cur_df), ncol = 50)
for (i in 1:50) {
    rand_eta[, i] <- rnorm(nrow(cur_df))
}


X_rand <- create_mat(cur_df, rand_vars_win)
profits_win_rand_eta <- matrix(nrow = nrow(cur_df), ncol = 50)
for (i in 1:50) {
    profits_win_rand_eta[, i] <- profits_win_cond + (X_rand %*% rand_coefs) * rand_eta[,i]
}


X_rand <- create_mat(cur_df, rand_vars_ru)
profits_ru_rand_eta <- matrix(nrow = nrow(cur_df), ncol = 50)
for (i in 1:50) {
    profits_ru_rand_eta[, i] <- profits_ru_cond + (X_rand %*% rand_coefs) * rand_eta[,i]

}

profits_win_rand_eta <- as.vector(profits_win_rand_eta)  
profits_ru_rand_eta <- as.vector(profits_ru_rand_eta)

##export for figures 
write.csv(profits_ru_rand_eta , "data/profits_rc.csv")

manufpi <- c(profits_ru_rand_eta, profits_win_rand_eta)
#-------------------------------------------------------------------------------#
#DISTRIBUTION OF WELFARE

#GOING TO USE CONDITIONAL W, AND THEN WE CAN ADD BACK IN 
#welfare: 
w_manuf <- as.data.frame(welfare_ru_cond) #w baseline
colnames(w_manuf) <- c("wbase")

#################################################################### Welfare in all locations 
# WHAT WE HAVE IS WELFARE IN THE RUNNER-UP LOCATION 
# --> SINCE THE AUCTION IS ON WELFARE (winning place has the highest pi+v)
# --> welfare in the runner-up place is the 2nd highest pi+v
# --> result from the auction literature: if we observe 2nd highest bidders w we can get full distribution of w
# --> the thing is for that result, we need the number of bidders

# WE HAVE NUMBER OF BIDDERS IN THE DATA: n_bid_win
w_manuf$nbid <- cur_df$n_bid

ru_m2<- w_manuf[which(w_manuf$nbid==2), 1:2]
ru_m3<- w_manuf[which(w_manuf$nbid==3), 1:2]
ru_m4<- w_manuf[which(w_manuf$nbid==4), 1:2]
ru_m5<- w_manuf[which(w_manuf$nbid==5), 1:2]
ru_m11<- w_manuf[which(w_manuf$nbid>5), 1:2]

t <- seq(-500,1800, by=1)

p2 <- parentcdf(ecdf(ru_m2$wbase)(t), 1,2 )
p3 <- parentcdf(ecdf(ru_m3$wbase)(t), 2,3 )
p4 <- parentcdf(ecdf(ru_m4$wbase)(t), 3,4 )
p5 <- parentcdf(ecdf(ru_m5$wbase)(t), 4,5 )
p11 <- parentcdf(ecdf(ru_m11$wbase)(t), 10,11 )

# now create mixture distribution
Fw <- (p2*length(ru_m2$wbase) + p3*length(ru_m3$wbase) + p4*length(ru_m4$wbase) +p5*length(ru_m5$wbase)  + p11*length(ru_m11$wbase))/length(w_manuf$wbase) 

#################################################################### Copula
#### possible values of v (manufacturing)
Funhat_mix <-  approxfun(t,Fw) 

# THIS IS THE CORRELATION BETWEEN RUNNER-UP PROFITS AND RUNNER-UP VALUATIONS
tau_base <- cor(profits_ru_cond, v_ru_cond_resid, method = "kendall")

# translate to the AMH dependence parameter 
d <- uniroot(function(x) ((3*x -2)/(3*x)) - (((2*(1 - x)^2)*log(1 - x))/(3*x^2)) - tau_base, interval=c(-1,-.00001))
d <- d$root

#joint density of copula
cop <- function(Hv, Hp, d) {
   (1+ d*((1+Hv)*(1+Hp)-3) + (d^2)* ((1-Hv)*(1-Hp)))/((1-(d*(1-Hv)*(1-Hp)))^3)
}

#For each t, find Hv that minimizes difference between Hv_input and Hv_predicted
#######################################################################################################

t <- seq(-300,1200, by=5)
Hv_input <- seq(0,1, by=.01)

cond_cop <- function(hpi, hv, d) {
  (hpi*(1-d*(1-hpi)))/((1-d*(1-hv)*(1-hpi))^2)
}
H_pi<- ecdf(manufpi)

#create grid of guesses for Hv
#for each guess Hv^g, draw pi from \hat{C}, S times 

p <- seq(-400,1000, by=1)
H_pi_sim <- 0
diff <- 0 
u <- runif(1000)

Hv_solve=0
for (y in 1:length(t)) { #possible values of v
  Hv_predicted=0
  for (z in 1:101) { #possible values of Hv 
    #first need to simulate Hpi
    H_pi_sim <- cond_cop(H_pi(p),Hv_input[z],d)
    Hp_sim <- approxfun(p, H_pi_sim)
    InvHpi = approxfun(Hp_sim(p),p)
    profits <- InvHpi(u)
    profits[which(is.na(profits))] <- -200
    tw <- t[y] + profits  
    tw[which(tw> 1800)]<- 1800
    tw[which(tw< -500)]<- -500
    Hv_predicted[z] <- sum(Funhat_mix(tw))/length(tw)
    diff[z] <- abs(Hv_input[z] - Hv_predicted[z])
  }
    Hv_solve[y] <- Hv_predicted[[which.min(diff)]]
}

####Manufacturing
x<- runif(10000, min=.1, max=.9995)
H_manuf <- approxfun(Hv_solve,t)
sample <-  H_manuf(x)
sample_fit <-  H_manuf(x)
sample_fit[which(sample_fit<=0)] <- 1
fit <- fitdistr(na.exclude(sample_fit), "lognormal")
sample_ln <- rlnorm(10000, fit$estimate['meanlog'], fit$estimate['sdlog'])


########################################### export for some figures/analysis
write.csv(w_manuf$wbase, "data/welfare_manuf.csv")
write.csv(sample_ln[which(sample_ln<1500)], "data/Hv_manuf.csv")
results <- as.data.frame(coef(mle))
write.csv(rbind(results, min_profit_win), "data/coef_manuf.csv")

#################################################### COUNTERFACTUAL AND MODEL FIT 

#################################################################################
#-------------------------------------------------------------------------------#
# 1. Create sampled shortlist: 
#################################################################################
df <- read_stata("data/analysis_cz_long.dta")

#create dummies 
df$manuf_type_1 <- as.integer(df$manuf_type == 1)
df$manuf_type_3 <- as.integer(df$manuf_type == 3)

#drop outliers 
df <- filter(df, sub_M < 1000, df$manuf==1) 

df_sample <- filter(df, runnerup==1 | winner==1 | pop>200000 )
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | occ_top2_naics3>1)
df_sample <- df_sample %>% group_by(id_deal) %>% arrange(id_deal, desc(perc_est_n4))
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | row_number()<=100 )
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 |  row_number()<=60 | r2w==1)
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | airport_any==1 | row_number()<=40 | r2w==1)
df_sample <- df_sample %>% ungroup() %>% mutate(n_bid = replace(n_bid, n_bid>15, 15))

#number of times to simulate shortlist: 
n_sim = 40
N_sim = n_sim*n_sim 

df_shortlist <- df_sample %>% crossing(sim = c(1:n_sim)) %>%  #this expands dataset to simulate over shortlists
  group_by(row_number()) %>% mutate(nsim=rnorm(1,0,1)) %>% ungroup() %>% #create random variable for sorting
  mutate(id_deal_sim = id_deal*100 + sim) %>% #new id_deal var, to track simulation number
  group_by(id_deal_sim) %>% # group by id_deal and simulation 
  arrange(id_deal_sim, desc(winner), desc(runnerup), nsim) %>% #sort 
  filter(row_number()<=n_bid) %>% select(-nsim) #only as many rows as there are bidders for each simulation 

df_shortlist <- df_shortlist %>% crossing(eta = c(1:n_sim)) %>% #this expands dataset to simulate over rand_eta draws
  mutate(id_deal_sim_eta = id_deal_sim*100+eta) %>% 
  group_by(id_deal_sim_eta) %>%  arrange(id_deal_sim_eta) %>%
  mutate(rand_eta = rnorm(1,0,1))  %>% select(-eta)


#################################################################################
#-------------------------------------------------------------------------------#
# 2. PREDICT PROFITS  
#################################################################################
  
X <- create_mat(df_shortlist, profit_vars_ru)
rand_vars <- c("corp_tax", "income_tax", "wage",  "occ_top2_naics3")
X_rand <- create_mat(df_shortlist, rand_vars)
df_shortlist$profits_rand_eta <- X %*% profit_vars_mle + (X_rand %*% rand_coefs)*df_shortlist$rand_eta - min_profit_win 
df_shortlist$profits_base  <- X %*% profit_vars_mle - min_profit_win 
# probably want to winsorize here. 
df_shortlist$profits_rand_eta <- winsor(df_shortlist$profits_rand_eta, trim=.01)


#################################################################################
#-------------------------------------------------------------------------------#
# 3. PICK WINNERS IN CF (profit only), MODEL FIT (simulated subsidy game) 
#################################################################################

#figure out quantile of each pi, and then map to quantile of v, simulate from copula. 

#empirical CDF of pi: 
H_pi1<- ecdf(df_shortlist$profits_rand_eta)

#given any value of pi, will give the location in the distribution 
#winning locations must have weakly positive profits
df_shortlist <- df_shortlist %>% mutate(profits_rand_eta=ifelse(profits_rand_eta<0 & winner==1, 0, profits_rand_eta)) %>%
  mutate(profits_base=ifelse(profits_base<0 & winner==1, 0, profits_base)) %>%
  mutate(Hpi1 = H_pi1(profits_rand_eta)) 
#%>% mutate(Hpi2 = H_pi2(profits_base))

#winners in data: 
df_shortlist_winners <- df_shortlist %>% filter(winner==1)
win_loc <- df_shortlist_winners %>% group_by(id_deal, winner, stateabbrev, sub_M) %>% 
  summarize(profit_win_base=mean(profits_base), profit_win_rand = mean(profits_rand_eta))
ru_loc <- df_shortlist %>% ungroup() %>% filter(runnerup==1) %>% select(all_of(c("runnerup", "id_deal", "stateabbrev"))) %>% distinct()

#################################################################################
# SUBSIDY GAME WINNER
#################################################################################
#Now for each location need to simulate valuation, given pi

#this is relationship between pi and v
cop_amh <- amhCopula(d, dim=2)

cf_vars <- c("id_deal", "id_deal_sim", "id_deal_sim_eta", "profits_rand_eta", "profits_base", 
             "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "sub_M", "personal_inc_pc", "unemp")

#here we simulate v for each simulated pi (pi w and without random coef.)
sl_sim_v <- df_shortlist %>% select(all_of(cf_vars))
sl_sim_v <- sl_sim_v %>% ungroup() %>%  group_by(row_number()) %>% mutate(u = runif(1)) %>% ungroup() 
sl_sim_v <- sl_sim_v  %>% 
  mutate(Hv1 = cCopula(cbind(Hpi1, u), copula=cop_amh, indices=2, inverse=TRUE)) %>%
  mutate(v1 = quantile(sample_ln[which(sample_ln<1500)], Hv1, names=FALSE)) 

#restriction: valuation in winning place cannot be less than 75% of observed subsidy. 
sl_sim_v <- sl_sim_v %>% mutate(v1=ifelse(winner==1 & v1<.75*sub_M, .75*sub_M, v1) )

#now choose winning location in each subsidy competition 
sim_win <- sl_sim_v %>% mutate(w = v1+profits_rand_eta) %>%
  group_by(id_deal_sim_eta) %>% arrange(id_deal, id_deal_sim, id_deal_sim_eta, desc(w)) %>%
  mutate(sim_sub = lead(w)-profits_rand_eta)  %>% filter(row_number()==1)

write.csv(sim_win, "data/sim_sub.csv")

#################################################################################
# COUNTERFACTUAL WINNER
################################################################################
cf_vars <- c("id_deal", "id_deal_sim", "id_deal_sim_eta", "profits_rand_eta", "profits_base", 
             "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "sub_M", "v1")

#winner in counterfactual: 
df_shortlist_cf_win <- sl_sim_v %>% group_by(id_deal_sim_eta) %>% 
  arrange(id_deal, id_deal_sim, id_deal_sim_eta, desc(profits_rand_eta), desc(winner)) %>% 
  select(all_of(cf_vars)) %>% filter(row_number()==1)


#################################################################################
# MODEL FIT
################################################################################

#check fit of model in terms of predicted winning locations
real_location <- df_shortlist_winners %>% group_by(stateabbrev) %>% summarize(no_obs = n()) %>% mutate(perc_loc = 100*no_obs/nrow(df_shortlist_winners)) 

sim_location <- sim_win %>% group_by(stateabbrev) %>% summarize(no_obs_sim = n()) %>% mutate(perc_loc_sim = 100*no_obs_sim/nrow(sim_win)) 

comp_location <- merge(real_location, sim_location, by="stateabbrev", all.x=TRUE, all.y=TRUE)
write.csv(comp_location, "data/modelfit_winninglocations.csv")

#don't always pick the right place when simulate (with subsidies)
#here percent is by the number of simulations
sim_location_id <- sim_win %>% group_by(id_deal, stateabbrev) %>% summarize(no_obs_sim = n()) %>% 
  mutate(perc_loc_sim = 100*no_obs_sim/N_sim) %>% mutate(adjustment = 1-perc_loc_sim/100)

#merge this in with the counterfactual winners to make the adjustment: 
#track and scale the valuations appropriately for the counterfactual table 

sim_cf_win <- df_shortlist_cf_win %>% group_by(id_deal, stateabbrev, winner) %>%  
  summarize(no_obs_cf = n(), mean_v = mean(v1), mean_pi = mean(profits_rand_eta))
sim_location_cf <- merge(sim_cf_win , sim_location_id, by=c("id_deal", "stateabbrev"), all.x=TRUE) 

#make sure each deal has the winning location as an option, and no need for adjustment for winners: 
sim_location_cf <- merge(sim_location_cf, win_loc, by=c("id_deal", "stateabbrev", "winner"), all.x=TRUE, all.y=TRUE) %>%
  mutate(adjustment = ifelse(winner==1,1,adjustment)) 

#if missing in model simulation then no need for adjustment:
sim_location_cf$adjustment[is.na(sim_location_cf$adjustment)] <- 1

sim_location_cf_summary <- sim_location_cf %>%  mutate(no_obs_cf_adj = adjustment*no_obs_cf) %>%
  group_by(id_deal, winner) %>% arrange(id_deal, desc(winner)) %>% 
  summarize(sum_adj = sum(no_obs_cf_adj), mean_v = mean(mean_v), mean_pi = mean(mean_pi)) 
share_winner <- sim_location_cf_summary %>% arrange(id_deal, desc(winner)) %>% 
  mutate(share = 100*(N_sim-lead(sum_adj))/N_sim) %>% filter(row_number()==1)

share_winner$share[is.na(share_winner$share)] <- 100  
share_winner <- merge(share_winner, win_loc, by=c("id_deal", "winner")) %>% select(-c("sum_adj"))

#now will append with the runners-up and non-runners-up/winners 
share_other <- sim_location_cf %>%  mutate(share = 100*adjustment*no_obs_cf/N_sim) %>% filter(winner==0) %>% select(-c("winner"))
share_other <- merge(share_other, ru_loc, by=c("id_deal", "stateabbrev"), all.x=TRUE) %>% 
  select(all_of(c("id_deal", "runnerup", "stateabbrev", "share", "mean_v", "mean_pi")))
share_other$runnerup[is.na(share_other$runnerup)] <- 0

cf_loc <- bind_rows(share_winner, share_other) %>% group_by(id_deal) %>% arrange(id_deal)
cf_loc$runnerup[is.na(cf_loc$runnerup)] <- 0
cf_loc$winner[is.na(cf_loc$winner)] <- 0
cf_loc <- cf_loc %>% mutate(mean_v=ifelse(is.na(mean_v),sub_M, mean_v))  %>% 
  mutate(mean_pi=ifelse(is.na(mean_pi),profit_win_rand, mean_pi)) %>% mutate(mean_pi=ifelse(mean_pi==0 & winner==1,profit_win_rand, mean_pi)) %>%
  mutate(adj_v = mean_v*(share/100)) %>%  mutate(adj_pi = mean_pi*(share/100))
write.csv(cf_loc, "data/cf_locations.csv")

#-------------------------------------------------------------------------------#
# NOW FOR SERVICES 
#-------------------------------------------------------------------------------#
# Filter data + create dummies

df <- read_stata("data/analysis_cz.dta")
df <- filter(df, ru_limit == 1) # filter to only include CZs with runner-ups
df <- filter(df, sub_M < 400)

df$serv <- !df$manuf

df$trade_0 <- as.integer(df$trade == 0)
df$trade_1 <- as.integer(df$trade == 1)

df$lowserv_0 <- as.integer(df$low_serv == 0)
df$lowserv_1 <- as.integer(df$low_serv == 1)

# sample one runner-up within subsidy ID
set.seed(1)
df <- df %>%
  group_by(id_win) %>%
  slice_sample(n = 1)

# Generate random variables
set.seed(1)
n_rand = 1000
rand_obs = matrix(nrow = nrow(df), ncol = n_rand)
for (i in 1:n_rand) {
  rand_obs[, i] <- rnorm(nrow(df))
}
#-------------------------------------------------------------------------------#
### RUN MLE 

cur_df <- filter(df, serv == 1)
cur_rand_obs <- rand_obs[df$serv == 1, ]

profit_vars <- c("diff_corp_tax", 
                 "diff_income_tax",
                 "diff_proptax",
                 "diff_r2w",
                 "diff_FHFA",
                 "diff_e_comm",
                 "diff_auto_roadnetwork",
                 "diff_incwage_top_5:lowserv_0",
                 "diff_incwage_top_5:lowserv_1",
                 "diff_FHFA:invest_B",
                 "diff_airport_any:trade_1",
                 "diff_top_RD:trade_0",
                 "diff_top_RD:trade_1",
                 "diff_income_tax:jobs_direct")

profit_vars_win <- c("corp_tax_win", 
                     "income_tax_win",
                     "proptax_win",
                     "r2w_win",
                     "FHFA_win",
                     "e_comm_win",
                     "auto_roadnetwork_win",
                     "incwage_top_5_win:lowserv_0",
                     "incwage_top_5_win:lowserv_1",
                     "FHFA_win:invest_B",
                     "airport_any_win:trade_1",
                     "top_RD_win:trade_0",
                     "top_RD_win:trade_1",
                     "income_tax_win:jobs_direct")

profit_vars_ru <- c("corp_tax", 
                    "income_tax",
                    "proptax",
                    "r2w",
                    "FHFA",
                    "e_comm",
                    "auto_roadnetwork",
                    "incwage_top_5:lowserv_0",
                    "incwage_top_5:lowserv_1",
                    "FHFA:invest_B",
                    "airport_any:trade_1",
                    "top_RD:trade_0",
                    "top_RD:trade_1",
                    "income_tax:jobs_direct")


v_vars <- c("jobs_direct",
            "mult_total",
            "invest_B",
            "income_tax",
            "corp_tax",
            "sales_tax",
            "proptax",
            "term_limit",
            "unemp",
            "incwage_top_5",
            "ln_pinc")

# variables for random coefficients
rand_vars_extend <- c("diff_corp_tax", 
                      "diff_income_tax",
                      "diff_proptax",
                      "diff_r2w",
                      "diff_FHFA",
                      "diff_auto_roadnetwork")


rand_vars <- c("diff_corp_tax", 
               "diff_income_tax",
               "diff_proptax",
               "diff_r2w")

#########
X <- create_mat(cur_df, c(profit_vars, v_vars, rand_vars))
na_filter <- rowSums(is.na(X)) == 0
cur_df <- cur_df[na_filter, ]
cur_rand_obs <- cur_rand_obs[na_filter, ]

# RUN MLE
vars <- c(profit_vars, v_vars)
res <- run_mle_rand(cur_df, cur_rand_obs, vars, rand_vars)

## regression coefficient tables
# export to csv so can make table for paper:
mle_tab <- coef(summary(res$mle))[, 1:2]
write.csv(mle_tab, "data/mle_coef_serv.csv")
#-------------------------------------------------------------------------------#
## Calculate profits, welfare, and valutions; output figs + summary stats
mle <- res$mle

# calculate residuals
reg_coef <- coef(mle)[!grepl("sigma", names(coef(mle)))]
X <- create_mat(cur_df, names(reg_coef))
resid <- cur_df$sub_M - (X %*% reg_coef)

profit_vars_mle <- coef(mle)[grepl("diff", names(coef(mle))) & !grepl("sigma", names(coef(mle)))]
v_vars_mle <- coef(mle)[!grepl("diff", names(coef(mle)))  & !grepl("sigma", names(coef(mle)))]

# Win profits
X <- create_mat(cur_df, profit_vars_win)
profits_win <- X %*% profit_vars_mle

# Ru profits
X <- create_mat(cur_df, profit_vars_ru)
profits_ru <- X %*% profit_vars_mle

# Ru valuation
X <- create_mat(cur_df, v_vars)
v_ru <- X %*% v_vars_mle

# subtract min profits + calculate welfare
min_profit <- min(profits_win)
profits_win_orig <- profits_win
profits_ru_orig <- profits_ru
profits_win <- profits_win_orig - min_profit
profits_ru <- profits_ru_orig - min_profit
welfare_ru_orig <- profits_ru_orig + v_ru + resid
welfare_ru <- profits_ru + v_ru + resid
v_ru_resid <- v_ru + resid

# v_ru_cond
v_ru_cond <- v_ru -
  coef(mle)['jobs_direct'] * cur_df$jobs_direct -
  coef(mle)['mult_total'] * cur_df$mult_total -
  coef(mle)['invest_B'] * cur_df$invest_B +
  coef(mle)['jobs_direct']  * mean(cur_df$jobs_direct)  +
  coef(mle)['mult_total'] * mean(cur_df$mult_total) +
  coef(mle)['invest_B'] * mean(cur_df$invest_B) 
v_ru_cond_resid =winsor(v_ru_cond + resid, trim=.01)

# profit_ru_cond
profits_ru_cond <- profits_ru -
  coef(mle)['diff_income_tax:jobs_direct'] * (cur_df$income_tax * cur_df$jobs_direct) +
  coef(mle)['diff_income_tax:jobs_direct'] * cur_df$income_tax * mean(cur_df$jobs_direct)  -
  coef(mle)['diff_FHFA:invest_B'] * (cur_df$FHFA * cur_df$invest_B) +
  coef(mle)['diff_FHFA:invest_B'] * cur_df$FHFA * mean(cur_df$invest_B)

profits_win_cond <- profits_win -
  coef(mle)['diff_income_tax:jobs_direct'] * (cur_df$income_tax_win * cur_df$jobs_direct) +
  coef(mle)['diff_income_tax:jobs_direct'] * cur_df$income_tax_win  * mean(cur_df$jobs_direct)  -
  coef(mle)['diff_FHFA:invest_B'] * (cur_df$FHFA_win * cur_df$invest_B) +
  coef(mle)['diff_FHFA:invest_B'] * cur_df$FHFA_win * mean(cur_df$invest_B)

## plot ru profits vs. valuations
corr <- cbind(profits_ru_cond, profits_ru, v_ru, v_ru_cond, v_ru_cond_resid, v_ru_resid, cur_df$id_win)
write.csv(corr, "data/runner_up_correlations_serv.csv")

welfare_ru_cond <- profits_ru_cond + v_ru_cond_resid

# distribution of profits with and without random coefs
rand_coefs <- coef(mle)[grepl("sigma", names(coef(mle))) & names(coef(mle)) != "sigma"]
rand_eta <- matrix(nrow = length(rand_coefs), ncol = 50)
rownames(rand_eta) <- names(rand_coefs)
for (i in 1:50) {
  rand_eta[, i] <- rnorm(length(rand_coefs))
}

X <- create_mat(cur_df, profit_vars_win)
X_rand <- create_mat(cur_df, rand_vars)
profits_win_rand_eta <- matrix(nrow = nrow(cur_df), ncol = 50)
for (i in 1:50) {
  profits_win_rand_eta[, i] <- X %*% profit_vars_mle + X_rand %*% (rand_eta[, i] * rand_coefs)-
    coef(mle)['diff_income_tax:jobs_direct'] * (cur_df$income_tax_win * cur_df$jobs_direct) +
    coef(mle)['diff_income_tax:jobs_direct'] * cur_df$income_tax_win * mean(cur_df$jobs_direct)  -
    coef(mle)['diff_FHFA:invest_B'] * (cur_df$FHFA_win * cur_df$invest_B) +
    coef(mle)['diff_FHFA:invest_B'] * cur_df$FHFA_win * mean(cur_df$invest_B)
}

X <- create_mat(cur_df, profit_vars_ru)
X_rand <- create_mat(cur_df, rand_vars)
profits_ru_rand_eta <- matrix(nrow = nrow(cur_df), ncol = 50)
for (i in 1:50) {
  profits_ru_rand_eta[, i] <- X %*% profit_vars_mle + X_rand %*% (rand_eta[, i] * rand_coefs) -
    coef(mle)['diff_income_tax:jobs_direct'] * (cur_df$income_tax * cur_df$jobs_direct) +
    coef(mle)['diff_income_tax:jobs_direct'] * cur_df$income_tax * mean(cur_df$jobs_direct) -
    coef(mle)['diff_FHFA:invest_B'] * (cur_df$FHFA * cur_df$invest_B) +
    coef(mle)['diff_FHFA:invest_B'] * cur_df$FHFA * mean(cur_df$invest_B)
}

profits_win_rand_eta <- as.vector(profits_win_rand_eta) - min_profit
profits_ru_rand_eta <- as.vector(profits_ru_rand_eta) - min_profit

##export for figures 
write.csv(profits_ru_rand_eta , "data/profits_rc_serv.csv")

servpi <- c(profits_ru_rand_eta, profits_win_rand_eta)
#-------------------------------------------------------------------------------#
#DISTRIBUTION OF WELFARE

#GOING TO USE CONDITIONAL W, AND THEN WE CAN ADD BACK IN 
#welfare: 
w_serv <- as.data.frame(welfare_ru_cond) #w baseline
colnames(w_serv) <- c("wbase")


#################################################################### Welfare in all locations 

# WHAT WE HAVE IS WELFARE IN THE RUNNER-UP LOCATION 
# --> SINCE THE AUCTION IS ON WELFARE (winning place has the highest pi+v)
# --> welfare in the runner-up place is the 2nd highest pi+v
# --> result from the auction literature: if we observe 2nd highest bidders w we can get full distribution of w
# --> the thing is for that result, we need the number of bidders


# WE HAVE NUMBER OF BIDDERS IN THE DATA: n_bid_win

w_serv$nbid <- cur_df$n_bid

ru_m2<- w_serv[which(w_serv$nbid==2), 1:2]
ru_m3<- w_serv[which(w_serv$nbid==3), 1:2]
ru_m4<- w_serv[which(w_serv$nbid==4), 1:2]
ru_m5<- w_serv[which(w_serv$nbid==5), 1:2]
ru_m11<- w_serv[which(w_serv$nbid>5), 1:2]

t <- seq(-500,1500, by=1)

p2 <- parentcdf(ecdf(ru_m2$wbase)(t), 1,2 )
p3 <- parentcdf(ecdf(ru_m3$wbase)(t), 2,3 )
p4 <- parentcdf(ecdf(ru_m4$wbase)(t), 3,4 )
p5 <- parentcdf(ecdf(ru_m5$wbase)(t), 4,5 )
p11 <- parentcdf(ecdf(ru_m11$wbase)(t), 10,11 )

# now create mixture distribution
Fw <- (p2*length(ru_m2$wbase) + p3*length(ru_m3$wbase) + p4*length(ru_m4$wbase) +p5*length(ru_m5$wbase)  + p11*length(ru_m11$wbase))/length(w_serv$wbase) 

#################################################################### Copula
#### possible values of v
Funhat_mix <-  approxfun(t,Fw) 

#first, actually, have to calculate dependence parameter

# THIS IS THE CORRELATION BETWEEN RUNNER-UP PROFITS AND RUNNER-UP VALUATIONS
tau_base <- cor(profits_ru_cond, v_ru_cond_resid, method = "kendall")

# translate to the AMH dependence parameter 
d <- uniroot(function(x) ((3*x -2)/(3*x)) - (((2*(1 - x)^2)*log(1 - x))/(3*x^2)) - tau_base, interval=c(-2,-.00001))
d <- max(-1,d$root)


#joint density of copula
cop <- function(Hv, Hp, d) {
  (1+ d*((1+Hv)*(1+Hp)-3) + (d^2)* ((1-Hv)*(1-Hp)))/((1-(d*(1-Hv)*(1-Hp)))^3)
}

#Simulate values for pi, Hp 
#######################################################################################################
#profit distribution is given by calculated profits using beta-hat, sigma
#therefore, just going to use the simulated random coef profits, but drop <0 

servpi <- c(profits_ru_rand_eta, profits_win_rand_eta)
servpi <- winsor(servpi, trim=.02)

#For each t, find Hv that minimizes difference between Hv_input and Hv_predicted
#######################################################################################################

t <- seq(-300,1200, by=5)
Hv_input <- seq(0,1, by=.01)

cond_cop <- function(hpi, hv, d) {
  (hpi*(1-d*(1-hpi)))/((1-d*(1-hv)*(1-hpi))^2)
}
H_pi<- ecdf(servpi)

#creat grid of guesses for Hv
#for each guess Hv^g, draw pi from \hat{C}, S times 

p <- seq(-400,1000, by=1)
H_pi_sim <- 0
diff <- 0 
u <- runif(1000)
# DO FOR EACH NUMBER OF BIDDERS

Hv_solve=0
for (y in 1:length(t)) { #possible values of v
  Hv_predicted=0
  for (z in 1:101) { #possible values of Hv 
    #first need to simulate Hpi
    H_pi_sim <- cond_cop(H_pi(p),Hv_input[z],d)
    Hp_sim <- approxfun(p, H_pi_sim)
    InvHpi = approxfun(Hp_sim(p),p)
    profits <- InvHpi(u)
    tw <- t[y] + profits  
    tw[which(tw> 1500)]<- 1500
    tw[which(tw< -500)]<- -500
    Hv_predicted[z] <- sum(Funhat_mix(tw))/length(tw)
    diff[z] <- abs(Hv_input[z] - Hv_predicted[z])
  }
  Hv_solve[y] <- Hv_predicted[[which.min(diff)]]
}


x<- runif(10000, min=.01, max=.9999)
H_serv <- approxfun(Hv_solve,t)
sample <-  H_serv(x)
sample_fit <-  na.omit(sample)
sample_fit[which(sample_fit<=0)] <- 1
fit <- fitdistr(sample_fit, "lognormal")
sample_ln <- rlnorm(10000, fit$estimate['meanlog'], fit$estimate['sdlog'])
summary(sample)
summary(sample_ln[which(sample_ln<1000)])


########################################### export for some figures/analysis
write.csv(w_serv$wbase, "data/welfare_serv.csv")
write.csv(sample_ln[which(sample_ln<1000)], "data/Hv_serv.csv")
results <- as.data.frame(coef(mle))
write.csv(rbind(results, min_profit), "data/coef_serv.csv")

#################################################### COUNTERFACTUAL AND MODEL FIT 

#################################################################################
#-------------------------------------------------------------------------------#
# 1. Create sampled shortlist: 
#################################################################################
df <- read_stata("data/analysis_cz_long.dta")

#drop outlier 
df <- filter(df, sub_M < 400, df$manuf==0) 

df$trade_0 <- as.integer(df$trade == 0)
df$trade_1 <- as.integer(df$trade == 1)

df$lowserv_0 <- as.integer(df$low_serv==0)
df$lowserv_1 <- as.integer(df$low_serv==1)

df_sample <- filter(df, runnerup==1 | winner==1 |  pop>200000)
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | occ_top2_naics3>1)
df_sample <- df_sample %>% group_by(id_deal) %>% arrange(id_deal, desc(perc_est_n4))
df_sample <- df_sample %>% filter(runnerup==1 | winner==1 | airport_any==1 | row_number()<=50 )
df_sample <- df_sample %>% ungroup() %>% mutate(n_bid = replace(n_bid, n_bid>15, 15))

#number of times to simulate shortlist: 
n_sim = 40
N_sim = n_sim*n_sim 

df_shortlist <- df_sample %>% crossing(sim = c(1:n_sim)) %>%  #this expands dataset to simulate over shortlists
  group_by(row_number()) %>% mutate(nsim=rnorm(1,0,1)) %>% ungroup() %>% #create random variable for sorting
  mutate(id_deal_sim = id_deal*100 + sim) %>% #new id_deal var, to track simulation number
  group_by(id_deal_sim) %>% # group by id_deal and simulation 
  arrange(id_deal_sim, desc(winner), desc(runnerup), nsim) %>% #sort 
  filter(row_number()<=n_bid) %>% select(-nsim) #only as many rows as there are bidders for each simulation 

df_shortlist <- df_shortlist %>% crossing(eta = c(1:n_sim)) %>% #this expands dataset to simulate over rand_eta draws
  mutate(id_deal_sim_eta = id_deal_sim*100+eta) %>% 
  group_by(id_deal_sim_eta) %>%  arrange(id_deal_sim_eta) %>%
  mutate(rand_eta = rnorm(1,0,1))  %>% select(-eta)


#################################################################################
#-------------------------------------------------------------------------------#
# 2. PREDICT PROFITS  
#################################################################################

X <- create_mat(df_shortlist, profit_vars_ru)
rand_vars <- c("corp_tax", "income_tax", "proptax",  "r2w")
X_rand <- create_mat(df_shortlist, rand_vars)
df_shortlist$profits_rand_eta <- X %*% profit_vars_mle + (X_rand %*% rand_coefs)*df_shortlist$rand_eta - min_profit
df_shortlist$profits_base  <- X %*% profit_vars_mle - min_profit
# probably want to winsorize here. 
df_shortlist$profits_rand_eta <- winsor(df_shortlist$profits_rand_eta, trim=.01)


#################################################################################
#-------------------------------------------------------------------------------#
# 2. PICK WINNERS IN CF (profit only), MODEL FIT (simulated subsidy game) 
#################################################################################

#figure out quantile of each pi, and then map to quantile of v, simulate from copula. 

#empirical CDF of pi: 
H_pi1<- ecdf(df_shortlist$profits_rand_eta)
H_pi2<- ecdf(df_shortlist$profits_base)

#given any value of pi, will give the location in the distribution 
#winning locations must have weakly positive profits
df_shortlist <- df_shortlist %>% mutate(profits_rand_eta=ifelse(profits_rand_eta<0 & winner==1, 0, profits_rand_eta)) %>%
  mutate(profits_base=ifelse(profits_base<0 & winner==1, 0, profits_base)) %>%
  mutate(Hpi1 = H_pi1(profits_rand_eta)) %>% mutate(Hpi2 = H_pi2(profits_base))

#winners in data: 
df_shortlist_winners <- df_shortlist %>% filter(winner==1)
win_loc <- df_shortlist_winners %>% group_by(id_deal, winner, stateabbrev, sub_M) %>% 
  summarize(profit_win_base=mean(profits_base), profit_win_rand = mean(profits_rand_eta))
ru_loc <- df_shortlist %>% ungroup() %>% filter(runnerup==1) %>% select(all_of(c("runnerup", "id_deal", "stateabbrev"))) %>% distinct()

#################################################################################
# SUBSIDY GAME WINNER
#################################################################################
#OK, now for each simulated location, need to simulate valuation, given pi

#this is relationship between pi and v
cop_amh <- amhCopula(d, dim=2)

cf_vars <- c("id_deal", "id_deal_sim", "id_deal_sim_eta", "profits_rand_eta", "profits_base", 
             "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "Hpi2", "sub_M", "unemp", "personal_inc_pc" )

#here we simulate v for each simulated pi (pi w and without random coef.)
sl_sim_v <- df_shortlist %>% select(all_of(cf_vars))
sl_sim_v <- sl_sim_v %>% ungroup() %>%  group_by(row_number()) %>% mutate(u = runif(1)) %>% ungroup() 
sl_sim_v <- sl_sim_v  %>% 
  mutate(Hv1 = cCopula(cbind(Hpi1, u), copula=cop_amh, indices=2, inverse=TRUE)) %>%
  mutate(v1 = quantile(sample_ln[which(sample_ln<800)], Hv1, names=FALSE)) 

#valuations in winning places constrained to be 75% of observed subsidy
sl_sim_v <- sl_sim_v %>% mutate(v1=ifelse(winner==1 & v1<.75*sub_M, .75*sub_M, v1) )

#now choose winning location in each subsidy competition 
sim_win <- sl_sim_v %>% mutate(w = v1+profits_rand_eta) %>%
  group_by(id_deal_sim_eta) %>% arrange(id_deal, id_deal_sim, id_deal_sim_eta, desc(w)) %>%
  mutate(sim_sub = lead(w)-profits_rand_eta)  %>% filter(row_number()==1)
write.csv(sim_win, "data/sim_sub_serv.csv")

#################################################################################
# COUNTERFACTUAL WINNER
################################################################################
cf_vars <- c("id_deal", "id_deal_sim", "id_deal_sim_eta", "profits_rand_eta", "profits_base", 
             "winner", "runnerup", "stateabbrev", "statefip", "cz", "Hpi1", "Hpi2", "sub_M", "v1")

#winner in counterfactual: 
df_shortlist_cf_win <- sl_sim_v %>% group_by(id_deal_sim_eta) %>% 
  arrange(id_deal, id_deal_sim, id_deal_sim_eta, desc(profits_rand_eta), desc(winner)) %>% 
  select(all_of(cf_vars)) %>% filter(row_number()==1)

#################################################################################
# MODEL FIT
################################################################################

#check fit of model in terms of predicted winning locations
real_location <- df_shortlist_winners %>% group_by(stateabbrev) %>% summarize(no_obs = n()) %>% mutate(perc_loc = 100*no_obs/nrow(df_shortlist_winners)) 

sim_location <- sim_win %>% group_by(stateabbrev) %>% summarize(no_obs_sim = n()) %>% mutate(perc_loc_sim = 100*no_obs_sim/nrow(sim_win)) 

comp_location <- merge(real_location, sim_location, by="stateabbrev", all.x=TRUE, all.y=TRUE)
write.csv(comp_location, "data/modelfit_winninglocations_serv.csv")

#don't always pick the right place when simulate (with subsidies)
#here percent is by the number of simulations
sim_location_id <- sim_win %>% group_by(id_deal, stateabbrev) %>% summarize(no_obs_sim = n()) %>% 
  mutate(perc_loc_sim = 100*no_obs_sim/N_sim) %>% mutate(adjustment = 1-perc_loc_sim/100)

#basically want to merge this in with the counterfactual winners to make the adjustment: 
#also going to track and scale the valuations appropriately for the counterfactual table ! 

sim_cf_win <- df_shortlist_cf_win %>% group_by(id_deal, stateabbrev, winner) %>%  
  summarize(no_obs_cf = n(), mean_v = mean(v1), mean_pi = mean(profits_rand_eta))
sim_location_cf <- merge(sim_cf_win , sim_location_id, by=c("id_deal", "stateabbrev"), all.x=TRUE) 
#make sure each deal has the winning location as an option, and no need for adjustment for winners: 
sim_location_cf <- merge(sim_location_cf, win_loc, by=c("id_deal", "stateabbrev", "winner"), all.x=TRUE, all.y=TRUE) %>%
  mutate(adjustment = ifelse(winner==1,1,adjustment)) 
#if missing in model simulation then no need for adjustment:
sim_location_cf$adjustment[is.na(sim_location_cf$adjustment)] <- 1

sim_location_cf_summary <- sim_location_cf %>%  mutate(no_obs_cf_adj = adjustment*no_obs_cf) %>%
  group_by(id_deal, winner) %>% arrange(id_deal, desc(winner)) %>% 
  summarize(sum_adj = sum(no_obs_cf_adj), mean_v = mean(mean_v), mean_pi = mean(mean_pi)) 
share_winner <- sim_location_cf_summary %>% arrange(id_deal, desc(winner)) %>% 
  mutate(share = 100*(N_sim-lead(sum_adj))/N_sim) %>% filter(row_number()==1)

share_winner$share[is.na(share_winner$share)] <- 100  
share_winner <- merge(share_winner, win_loc, by=c("id_deal", "winner")) %>% select(-c("sum_adj"))

#now will append with the runners-up and non-runners-up/winners 
share_other <- sim_location_cf %>%  mutate(share = 100*adjustment*no_obs_cf/N_sim) %>% filter(winner==0) %>% select(-c("winner"))
share_other <- merge(share_other, ru_loc, by=c("id_deal", "stateabbrev"), all.x=TRUE) %>% 
  select(all_of(c("id_deal", "runnerup", "stateabbrev", "share", "mean_v", "mean_pi")))
share_other$runnerup[is.na(share_other$runnerup)] <- 0

cf_loc <- bind_rows(share_winner, share_other) %>% group_by(id_deal) %>% arrange(id_deal)
cf_loc$runnerup[is.na(cf_loc$runnerup)] <- 0
cf_loc$winner[is.na(cf_loc$winner)] <- 0
cf_loc <- cf_loc %>% mutate(mean_v=ifelse(is.na(mean_v),sub_M, mean_v))  %>% 
  mutate(mean_pi=ifelse(is.na(mean_pi),profit_win_rand, mean_pi)) %>% mutate(mean_pi=ifelse(mean_pi==0 & winner==1,profit_win_rand, mean_pi)) %>%
  mutate(adj_v = mean_v*(share/100)) %>%  mutate(adj_pi = mean_pi*(share/100))
write.csv(cf_loc, "data/cf_locations_serv.csv")

#####################################################################################
#FIGURES 
######################################################################################

v_serv <- read_csv("data/Hv_serv.csv")
v_serv <- as.data.frame(v_serv[2])
v_manuf <- read_csv("data/Hv_manuf.csv")
v_manuf <- as.data.frame(v_manuf[2])
w_serv <- read_csv("data/welfare_serv.csv")
w_serv <- as.data.frame(w_serv[2])
w_manuf <- read_csv("data/welfare_manuf.csv")
w_manuf <- as.data.frame(w_manuf[2])

###################################### 
# FIGURE K.2
ggplot() + stat_ecdf(aes(x=w_manuf$x, color="Manufacturing", linetype="Manufacturing"), geom="line", size=1)  + 
  stat_ecdf(aes(x=w_serv$x, color="Trade/Services", linetype="Trade/Services"), geom="line",size=1) +  
  theme(axis.text=element_text(size=12), axis.title=element_text(size=12), legend.text=element_text(size=12)) + 
  scale_color_manual("", values = c("Manufacturing" = "dodgerblue",
                                    "Trade/Services" = "dodgerblue4")) +
  scale_linetype_manual("", values = c("Manufacturing" = "solid",
                                       "Trade/Services" = "dashed")) +
  coord_cartesian(xlim=c(0,1000)) +labs(x="Welfare ($M)", linetype= "", y="")  + 
  theme(legend.position = c(0.8, 0.25)) + guides(fill = guide_legend(keywidth = 1, keyheight = 1),
                                                 linetype=guide_legend(keywidth = 3, keyheight = 1),
                                                 colour=guide_legend(keywidth = 3, keyheight = 1))
ggsave("draft/appendix/ecdf_w.eps")

ggplot() + stat_density(aes(x=w_manuf$x, color="Manufacturing", linetype="Manufacturing"), geom="line", size=1)  + 
  stat_density(aes(x=w_serv$x, color="Trade/Services", linetype="Trade/Services"), geom="line",size=1) +  
  theme(axis.text=element_text(size=12), axis.title=element_text(size=12), legend.text=element_text(size=12)) + 
  scale_color_manual("", values = c("Manufacturing" = "dodgerblue",
                                    "Trade/Services" = "dodgerblue4")) +
  scale_linetype_manual("", values = c("Manufacturing" = "solid",
                                       "Trade/Services" = "dashed")) +
  coord_cartesian(xlim=c(0,1000)) +labs(x="Welfare ($M)", linetype= "", y="")  + 
  theme(legend.position = c(0.8, 0.25)) + guides(fill = guide_legend(keywidth = 1, keyheight = 1),
                                                 linetype=guide_legend(keywidth = 3, keyheight = 1),
                                                 colour=guide_legend(keywidth = 3, keyheight = 1))
ggsave("draft/appendix/epdf_w.eps")

######################################
# FIGURE K.3

ggplot() + stat_ecdf(aes(x=v_manuf$x, color="Manufacturing", linetype="Manufacturing"), geom="line", size=1)  + 
  stat_ecdf(aes(x=v_serv$x, color="Trade/Services", linetype="Trade/Services"), geom="line",size=1) +  
  theme(axis.text=element_text(size=12), axis.title=element_text(size=12), legend.text=element_text(size=12)) + 
  scale_color_manual("", values = c("Manufacturing" = "dodgerblue",
                                    "Trade/Services" = "dodgerblue4")) +
  scale_linetype_manual("", values = c("Manufacturing" = "solid",
                                    "Trade/Services" = "dashed")) +
  coord_cartesian(xlim=c(0,1000)) +labs(x="valuation ($M)", linetype= "", y="")  + 
  theme(legend.position = c(0.8, 0.25)) + guides(fill = guide_legend(keywidth = 1, keyheight = 1),
                                                linetype=guide_legend(keywidth = 3, keyheight = 1),
                                                colour=guide_legend(keywidth = 3, keyheight = 1))
ggsave("draft/appendix/ecdf_v.eps")



ggplot() + stat_density(aes(x=v_manuf$x, color="Manufacturing", linetype="Manufacturing"), geom="line", size=1)  + 
  stat_density(aes(x=v_serv$x, color="Trade/Services", linetype="Trade/Services"), geom="line",size=1) +  
  theme(axis.text=element_text(size=12), axis.title=element_text(size=12), legend.text=element_text(size=12)) + 
  scale_color_manual("", values = c("Manufacturing" = "dodgerblue",
                                    "Trade/Services" = "dodgerblue4")) +
  scale_linetype_manual("", values = c("Manufacturing" = "solid",
                                       "Trade/Services" = "dashed")) +
  coord_cartesian(xlim=c(0,1000)) +labs(x="valuation ($M)", linetype= "", y="")  + 
  theme(legend.position = c(0.8, 0.25)) + guides(fill = guide_legend(keywidth = 1, keyheight = 1),
                                                linetype=guide_legend(keywidth = 3, keyheight = 1),
                                                colour=guide_legend(keywidth = 3, keyheight = 1))
ggsave("draft/appendix/epdf_v.eps")
